/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.util;

import java.io.*;
import java.util.*;

import net.nutch.db.*;

/*************************************************************
 * When we generate a fetchlist, we need to choose a "cutoff"
 * score, such that any scores above that cutoff will be included
 * in the fetchlist and any scores below it will not be.  (It is
 * too expensive to do the obvious thing, which is to sort the
 * list of all pages by score and pick the top K.)
 *
 * We need a good way to choose that cutoff.  ScoreStats is used
 * during LinkAnalysis to track the distribution of the scores we
 * compute.  We bucketize the score space into 2000 buckets.  The
 * first 1000 are equally-spaced counts for the range 0..1.0
 * (non-inclusive).  The second 1000 are logarithmically spaced
 * between 1 and Float.MAX_VALUE.
 *
 * If the score is < 1, we choose a bucket by taking
 * floor(score * 1000) and incrementing the resulting slot.
 *
 * If the score is >= 1, we take the base-10 log of the score and
 * scale it by (1000 / log10(Float.MAX_VALUE)), so that the range
 * 1..Float.MAX_VALUE maps evenly (in log space) onto the upper
 * 1000 slots.  The floor of that scaled value, offset by 1000,
 * is the bucket index.
 *
 * @author Mike Cafarella
 ***************************************************************/
public class ScoreStats {
    private final static double INVERTED_LOG_BASE_TEN =
        (1.0 / Math.log(10));
    private final static double EXP_127_MODIFIER =
        (1000.0 / (Math.log(Float.MAX_VALUE) * INVERTED_LOG_BASE_TEN));
    private final static double RANGE_COMPRESSOR =
        INVERTED_LOG_BASE_TEN * EXP_127_MODIFIER;

    long totalScores = 0;

    //
    // For bucketizing score counts
    //
    long buckets[] = new long[2001];

    /**
     */
    public ScoreStats() {
    }

    /**
     * Increment the counter in the right place.  We keep
     * 2000 different buckets.  Half of them are for scores < 1,
     * and half are for scores >= 1.
     *
     * Dies when it tries to fill bucket "1132".
     */
    public void addScore(float score) {
        if (score < 1) {
            // Scores below 1 fall into the evenly-spaced buckets 0..999.
            int index = (int) Math.floor(score * 1000);
            buckets[index]++;
        } else {
            // Scores of 1 and above fall into the log-spaced buckets
            // 1000..2000: take the base-10 log and rescale so that
            // Float.MAX_VALUE lands in the last slot.
            int index = (int) Math.floor(Math.log(score) * RANGE_COMPRESSOR);
            index += 1000;
            buckets[index]++;
        }
        totalScores++;
    }

    /**
     * Print out the distribution, with greater specificity
     * for percentiles 90th - 100th.
     */
    public void emitDistribution(PrintStream pout) {
        pout.println("***** Estimated Score Distribution *****");
        pout.println("  (to choose a fetchlist cutoff score)");
        pout.println();

        // Figure out how big each percentile chunk is.
        double decileChunk = totalScores / 10.0;
        double percentileChunk = totalScores / 100.0;

        // Now, emit everything
        double grandTotal = 0, minScore = Double.MAX_VALUE, maxScore = -Double.MAX_VALUE;
        long scoresSoFar = 0;
        int decileCount = 0, percentileCount = 0;

        // Go through all the sample buckets
        for (int i = 0; i < buckets.length; i++) {
            //
            // Always increment the seen-sample counter by the
            // number of samples in the current bucket.
            //
            scoresSoFar += buckets[i];

            // From the bucket index, recreate the
            // original score (as best we can)
            double reconstructedValue = 0.0;
            if (i < 1000) {
                reconstructedValue = i / 1000.0;
            } else {
                int localIndex = i - 1000;
                reconstructedValue = Math.exp(localIndex / RANGE_COMPRESSOR);
            }

            // Keep running stats on min, max, avg scores
            grandTotal += (reconstructedValue * buckets[i]);
            if (buckets[i] > 0) {
                if (minScore > reconstructedValue) {
                    minScore = reconstructedValue;
                }
                if (maxScore < reconstructedValue) {
                    maxScore = reconstructedValue;
                }
            }

            //
            // If the number of samples we've seen so far is
            // GTE the predicted percentile break, then we want to
            // emit a println().
            //
            if (scoresSoFar >= ((decileCount * decileChunk) +
                                (percentileCount * percentileChunk))) {
                // Compute what percentile of the items
                // we've reached
                double precisePercentile =
                    ((int) Math.round(((totalScores - scoresSoFar) /
                                       (totalScores * 1.0)) * 10000)) / 100.0;

                // Emit
                String equalityOperator = ">=";
                if ((totalScores - scoresSoFar) == 0) {
                    equalityOperator = ">";
                }
                pout.println(precisePercentile + "% (" +
                             (totalScores - scoresSoFar) + ") have score " +
                             equalityOperator + " " + reconstructedValue);

                // Bump our decile and percentile counters.
                // We may have to bump multiple times if
                // a single bucket carried us across several
                // boundaries.
                while (decileCount < 9 &&
                       scoresSoFar >= (decileCount * decileChunk) +
                                      (percentileCount * percentileChunk)) {
                    decileCount++;
                }
                if (decileCount >= 9) {
                    while (percentileCount < 10 &&
                           scoresSoFar >= (decileCount * decileChunk) +
                                          (percentileCount * percentileChunk)) {
                        percentileCount++;
                    }
                }

                // If we've reached the top percentile, then we're done!
                if (percentileCount >= 10) {
                    break;
                }
            }
        }

        pout.println();
        pout.println();
        pout.println("Min score is " + minScore);
        pout.println("Max score is " + maxScore);
        pout.println("Average score is " + (grandTotal / scoresSoFar));
    }

    /**
     */
    public static void main(String argv[]) throws IOException {
        if (argv.length < 1) {
            System.out.println("Usage: java net.nutch.util.ScoreStats [-real <db>] [-simulated <numScores> <min> <max> [seed]]");
            return;
        }

        File dbFile = null;
        long seed = new Random().nextLong();
        boolean simulated = false;
        int numScores = 0;
        float min = 0, max = 0;

        if ("-real".equals(argv[0])) {
            dbFile = new File(argv[1]);
        } else if ("-simulated".equals(argv[0])) {
            simulated = true;
            numScores = Integer.parseInt(argv[1]);
            min = Float.parseFloat(argv[2]);
            max = Float.parseFloat(argv[3]);
            if (argv.length > 4) {
                seed = Long.parseLong(argv[4]);
            }
        } else {
            System.out.println("No command specified");
            return;
        }

        System.out.println("Using seed: " + seed);

        ScoreStats ss = new ScoreStats();
        if (simulated) {
            // Generate numScores uniform random scores in [min, max).
            Random r = new Random(seed);
            for (int i = 0; i < numScores; i++) {
                float newScore = min + (r.nextFloat() * (max - min));
                ss.addScore(newScore);
            }
        } else {
            // Read the score of every page in the web db.
            IWebDBReader reader = new WebDBReader(dbFile);
            try {
                for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
                    Page p = (Page) e.nextElement();
                    ss.addScore(p.getScore());
                }
            } finally {
                reader.close();
            }
        }

        ss.emitDistribution(System.out);
    }
}
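
/*
 * Editor's sketch (not part of the original Nutch source): a minimal
 * driver showing how the bucket scheme described in the class comment
 * plays out in practice.  The class name ScoreStatsBucketDemo is
 * hypothetical, and the bucket indices quoted in the comments are
 * approximate, assuming Float.MAX_VALUE ~= 3.4e38 (log10 ~= 38.5).
 */
class ScoreStatsBucketDemo {
    public static void main(String[] args) {
        ScoreStats stats = new ScoreStats();

        // A score below 1 lands in the evenly-spaced buckets 0..999:
        // 0.25 -> bucket floor(0.25 * 1000) = 250.
        stats.addScore(0.25f);

        // A score of 1 or more lands in the log-spaced buckets 1000..2000:
        // 100 -> bucket 1000 + floor(log10(100) * 1000 / 38.5) ~= 1051.
        stats.addScore(100.0f);

        // Print the estimated distribution for these two samples.
        stats.emitDistribution(System.out);
    }
}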